In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
In [5]:
diabetes=pd.read_csv(r"E:\Ankit Jain\D drive\Aviraj Personal File\IMS Analytics Class\Github sets\Diabetes Analysis\diabetes.csv")
In [6]:
diabetes.head()
Out[6]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
0 6 148 72 35 0 33.6 0.627 50 1
1 1 85 66 29 0 26.6 0.351 31 0
2 8 183 64 0 0 23.3 0.672 32 1
3 1 89 66 23 94 28.1 0.167 21 0
4 0 137 40 35 168 43.1 2.288 33 1
In [7]:
diabetes.rename({"DiabetesPedigreeFunction":"Pedigree"},axis=1,inplace=True)
In [8]:
diabetes.isnull().sum()
Out[8]:
Pregnancies      0
Glucose          0
BloodPressure    0
SkinThickness    0
Insulin          0
BMI              0
Pedigree         0
Age              0
Outcome          0
dtype: int64
In [9]:
diabetes.describe()
Out[9]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI Pedigree Age Outcome
count 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000
mean 3.845052 120.894531 69.105469 20.536458 79.799479 31.992578 0.471876 33.240885 0.348958
std 3.369578 31.972618 19.355807 15.952218 115.244002 7.884160 0.331329 11.760232 0.476951
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.078000 21.000000 0.000000
25% 1.000000 99.000000 62.000000 0.000000 0.000000 27.300000 0.243750 24.000000 0.000000
50% 3.000000 117.000000 72.000000 23.000000 30.500000 32.000000 0.372500 29.000000 0.000000
75% 6.000000 140.250000 80.000000 32.000000 127.250000 36.600000 0.626250 41.000000 1.000000
max 17.000000 199.000000 122.000000 99.000000 846.000000 67.100000 2.420000 81.000000 1.000000
In [10]:
diabetes.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Pregnancies    768 non-null    int64  
 1   Glucose        768 non-null    int64  
 2   BloodPressure  768 non-null    int64  
 3   SkinThickness  768 non-null    int64  
 4   Insulin        768 non-null    int64  
 5   BMI            768 non-null    float64
 6   Pedigree       768 non-null    float64
 7   Age            768 non-null    int64  
 8   Outcome        768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
In [11]:
diabetes.columns
Out[11]:
Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'Pedigree', 'Age', 'Outcome'],
      dtype='object')
In [12]:
for column in diabetes:
    print(diabetes[column].value_counts())
    print()
1     135
0     111
2     103
3      75
4      68
5      57
6      50
7      45
8      38
9      28
10     24
11     11
13     10
12      9
14      2
15      1
17      1
Name: Pregnancies, dtype: int64

100    17
99     17
129    14
125    14
111    14
       ..
177     1
172     1
169     1
160     1
199     1
Name: Glucose, Length: 136, dtype: int64

70     57
74     52
68     45
78     45
72     44
64     43
80     40
76     39
60     37
0      35
62     34
66     30
82     30
88     25
84     23
90     22
86     21
58     21
50     13
56     12
52     11
54     11
92      8
75      8
65      7
94      6
85      6
48      5
44      4
96      4
110     3
100     3
98      3
106     3
108     2
104     2
30      2
55      2
46      2
40      1
38      1
24      1
95      1
61      1
102     1
114     1
122     1
Name: BloodPressure, dtype: int64

0     227
32     31
30     27
27     23
23     22
33     20
18     20
28     20
31     19
39     18
19     18
29     17
37     16
26     16
22     16
40     16
25     16
35     15
41     15
36     14
15     14
17     14
20     13
24     12
42     11
13     11
21     10
34      8
46      8
38      7
12      7
14      6
16      6
11      6
43      6
45      6
10      5
44      5
48      4
47      4
50      3
49      3
54      2
52      2
7       2
8       2
60      1
56      1
63      1
51      1
99      1
Name: SkinThickness, dtype: int64

0      374
105     11
140      9
130      9
120      8
      ... 
271      1
270      1
108      1
112      1
846      1
Name: Insulin, Length: 186, dtype: int64

32.0    13
31.6    12
31.2    12
0.0     11
33.3    10
        ..
32.1     1
52.9     1
31.3     1
45.7     1
42.8     1
Name: BMI, Length: 248, dtype: int64

0.254    6
0.258    6
0.259    5
0.238    5
0.207    5
        ..
0.886    1
0.804    1
1.251    1
0.382    1
0.375    1
Name: Pedigree, Length: 517, dtype: int64

22    72
21    63
25    48
24    46
23    38
28    35
26    33
27    32
29    29
31    24
41    22
30    21
37    19
42    18
33    17
32    16
36    16
38    16
45    15
34    14
40    13
43    13
46    13
39    12
35    10
50     8
44     8
51     8
52     8
58     7
47     6
54     6
57     5
60     5
48     5
49     5
53     5
55     4
62     4
63     4
66     4
56     3
59     3
65     3
67     3
61     2
69     2
72     1
64     1
68     1
70     1
81     1
Name: Age, dtype: int64

0    500
1    268
Name: Outcome, dtype: int64

In [13]:
sns.pairplot(diabetes)
Out[13]:
<seaborn.axisgrid.PairGrid at 0x1ce82f93550>
In [14]:
plt.figure(figsize=(15,5))
sns.boxplot(data=diabetes)
Out[14]:
<AxesSubplot:>
In [15]:
diabetes.corr()
Out[15]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI Pedigree Age Outcome
Pregnancies 1.000000 0.129459 0.141282 -0.081672 -0.073535 0.017683 -0.033523 0.544341 0.221898
Glucose 0.129459 1.000000 0.152590 0.057328 0.331357 0.221071 0.137337 0.263514 0.466581
BloodPressure 0.141282 0.152590 1.000000 0.207371 0.088933 0.281805 0.041265 0.239528 0.065068
SkinThickness -0.081672 0.057328 0.207371 1.000000 0.436783 0.392573 0.183928 -0.113970 0.074752
Insulin -0.073535 0.331357 0.088933 0.436783 1.000000 0.197859 0.185071 -0.042163 0.130548
BMI 0.017683 0.221071 0.281805 0.392573 0.197859 1.000000 0.140647 0.036242 0.292695
Pedigree -0.033523 0.137337 0.041265 0.183928 0.185071 0.140647 1.000000 0.033561 0.173844
Age 0.544341 0.263514 0.239528 -0.113970 -0.042163 0.036242 0.033561 1.000000 0.238356
Outcome 0.221898 0.466581 0.065068 0.074752 0.130548 0.292695 0.173844 0.238356 1.000000
In [16]:
plt.figure(figsize=(15,5),frameon=False)
sns.heatmap(diabetes.corr(),annot=True)
Out[16]:
<AxesSubplot:>
In [17]:
import plotly.express as px
px.scatter(diabetes,x="Insulin",y="BMI")
In [18]:
px.scatter(diabetes,x="SkinThickness",y="Pedigree")
In [19]:
diabetes.head(3)
Out[19]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI Pedigree Age Outcome
0 6 148 72 35 0 33.6 0.627 50 1
1 1 85 66 29 0 26.6 0.351 31 0
2 8 183 64 0 0 23.3 0.672 32 1
In [20]:
pd.crosstab(diabetes["Outcome"],diabetes["Pregnancies"],margins=True)
Out[20]:
Pregnancies 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 17 All
Outcome
0 73 106 84 48 45 36 34 20 16 10 14 4 5 5 0 0 0 500
1 38 29 19 27 23 21 16 25 22 18 10 7 4 5 2 1 1 268
All 111 135 103 75 68 57 50 45 38 28 24 11 9 10 2 1 1 768
In [21]:
diabetes["Age"].min(),diabetes["Age"].max()
Out[21]:
(21, 81)
In [22]:
age=pd.cut(diabetes["Age"],[20,30,40,50,60,70,80,90])
diabetes.pivot_table(values=["Pregnancies","Glucose","BloodPressure","SkinThickness","Insulin"],
                     index=["Outcome",age],aggfunc=sum,margins=True,observed=True)
Out[22]:
BloodPressure Glucose Insulin Pregnancies SkinThickness
Outcome Age
0 (20, 30] 21486 34982 24900 636 6819
(30, 40] 5746 9246 5875 428 1628
(40, 50] 3606 5292 2165 320 953
(50, 60] 1816 2865 1026 156 218
(60, 70] 1364 2352 370 98 181
(70, 80] 0 119 0 2 0
(80, 90] 74 134 60 9 33
1 (20, 30] 5754 12629 10256 201 2270
(30, 40] 5287 10564 6168 400 1597
(40, 50] 4845 8820 4646 485 1238
(50, 60] 2507 4757 5820 196 718
(60, 70] 588 1087 0 22 117
All 53073 92847 61286 2953 15772
In [23]:
diabetes.pivot_table(values=["Glucose","BloodPressure","SkinThickness","Insulin"],
                     index= ["Outcome",age],aggfunc="sum").plot()
Out[23]:
<AxesSubplot:xlabel='Outcome,Age'>
In [24]:
x=diabetes.iloc[ : ,0:8]
y=diabetes.Outcome
In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
logreg=LogisticRegression()
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=.25,random_state=3)
In [26]:
logreg.fit(xtrain,ytrain)
Out[26]:
LogisticRegression()
In [27]:
ypred=logreg.predict(xtest)
ypred
Out[27]:
array([0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0], dtype=int64)
In [28]:
pd.DataFrame({"Actual": ytest[0:20],"Predicted" :ypred[0:20]})
Out[28]:
Actual Predicted
51 0 0
378 1 1
27 0 0
579 1 1
86 0 0
144 0 0
65 0 0
617 0 0
10 0 0
522 0 0
535 1 0
123 0 0
389 0 0
399 1 1
270 1 1
81 0 0
80 0 0
450 0 0
693 1 1
648 1 0
In [29]:
metrics.confusion_matrix(ytest,ypred)
Out[29]:
array([[96, 16],
       [36, 44]], dtype=int64)
In [30]:
round((96+44)/(96+16+44+36)*100)
Out[30]:
73
In [31]:
print(metrics.classification_report(ytest,ypred))
              precision    recall  f1-score   support

           0       0.73      0.86      0.79       112
           1       0.73      0.55      0.63        80

    accuracy                           0.73       192
   macro avg       0.73      0.70      0.71       192
weighted avg       0.73      0.73      0.72       192

In [32]:
import statsmodels.api as sma
logmodel=sma.Logit(y,x)
result=logmodel.fit()
print(result.summary2())
Optimization terminated successfully.
         Current function value: 0.608498
         Iterations 5
                         Results: Logit
=================================================================
Model:              Logit            Pseudo R-squared: 0.059     
Dependent Variable: Outcome          AIC:              950.6528  
Date:               2021-06-19 15:00 BIC:              987.8031  
No. Observations:   768              Log-Likelihood:   -467.33   
Df Model:           7                LL-Null:          -496.74   
Df Residuals:       760              LLR p-value:      2.5825e-10
Converged:          1.0000           Scale:            1.0000    
No. Iterations:     5.0000                                       
-----------------------------------------------------------------
                   Coef.  Std.Err.    z    P>|z|   [0.025  0.975]
-----------------------------------------------------------------
Pregnancies        0.1284   0.0286  4.4843 0.0000  0.0723  0.1845
Glucose            0.0129   0.0027  4.7568 0.0000  0.0076  0.0183
BloodPressure     -0.0303   0.0047 -6.4806 0.0000 -0.0395 -0.0212
SkinThickness      0.0002   0.0061  0.0323 0.9742 -0.0117  0.0121
Insulin            0.0007   0.0008  0.9420 0.3462 -0.0008  0.0023
BMI               -0.0048   0.0107 -0.4494 0.6531 -0.0258  0.0162
Pedigree           0.3203   0.2399  1.3351 0.1818 -0.1499  0.7905
Age               -0.0156   0.0084 -1.8517 0.0641 -0.0322  0.0009
=================================================================

In [33]:
print(round(metrics.accuracy_score(ytest,ypred)*100))
print(round(metrics.precision_score(ytest,ypred)*100))
print(round(metrics.recall_score(ytest,ypred)*100))
73
73
55
In [ ]: